chore(tpch): Create one folder for each scale_factor #3427
dangotbanned merged 16 commits into tpch/refactor-cli from scale_factor #3427

Conversation
| # Table names used to construct paths dynamically | ||
| TBL_LINEITEM = "lineitem" | ||
| TBL_REGION = "region" | ||
| TBL_NATION = "nation" | ||
| TBL_SUPPLIER = "supplier" | ||
| TBL_PART = "part" |
There was a problem hiding this comment.
Since the names of the constants are all longer than the values ...
This can give us autocomplete and shrink the diff 🙂
Show diff
diff --git a/tpch/classes.py b/tpch/classes.py
index 4fc778a4d..d961e3ac6 100644
--- a/tpch/classes.py
+++ b/tpch/classes.py
@@ -15,6 +15,7 @@ from tpch.constants import (
LOGGER_NAME,
QUERIES_PACKAGE,
QUERY_IDS,
+ DBTableName,
_scale_factor_dir,
)
@@ -57,11 +58,11 @@ class Backend:
class Query:
id: QueryID
- table_names: tuple[str, ...]
+ table_names: tuple[DBTableName, ...]
_into_xfails: tuple[tuple[Predicate, str, XFailRaises], ...]
_into_skips: tuple[tuple[Predicate, str], ...]
- def __init__(self, query_id: QueryID, table_names: tuple[str, ...]) -> None:
+ def __init__(self, query_id: QueryID, table_names: tuple[DBTableName, ...]) -> None:
self.id = query_id
self.table_names = table_names
self._into_xfails = ()
diff --git a/tpch/constants.py b/tpch/constants.py
index e5d120839..91de69cf2 100644
--- a/tpch/constants.py
+++ b/tpch/constants.py
@@ -4,7 +4,7 @@ from functools import cache
from pathlib import Path
from typing import TYPE_CHECKING, get_args
-from tpch.typing_ import Artifact, QueryID
+from tpch.typing_ import Artifact, DBTableName, QueryID
if TYPE_CHECKING:
from collections.abc import Mapping
@@ -20,16 +20,7 @@ def _scale_factor_dir(scale_factor: float) -> Path:
return DATA_DIR / f"sf{scale_factor}"
-DATABASE_TABLE_NAMES = (
- "lineitem",
- "customer",
- "nation",
- "orders",
- "part",
- "partsupp",
- "region",
- "supplier",
-)
+DATABASE_TABLE_NAMES = get_args(DBTableName)
QUERY_IDS: tuple[QueryID, ...] = get_args(QueryID)
GLOBS: Mapping[Artifact, str] = {
"database": r"*[!0-9].parquet",
diff --git a/tpch/tests/conftest.py b/tpch/tests/conftest.py
index cfdca7e6d..8543e0bf1 100644
--- a/tpch/tests/conftest.py
+++ b/tpch/tests/conftest.py
@@ -12,17 +12,8 @@ from tpch.constants import _scale_factor_dir
if TYPE_CHECKING:
from collections.abc import Iterator
- from tpch.typing_ import QueryID
-
-# Table names used to construct paths dynamically
-TBL_LINEITEM = "lineitem"
-TBL_REGION = "region"
-TBL_NATION = "nation"
-TBL_SUPPLIER = "supplier"
-TBL_PART = "part"
-TBL_PARTSUPP = "partsupp"
-TBL_ORDERS = "orders"
-TBL_CUSTOMER = "customer"
+ from tpch.typing_ import DBTableName, QueryID
+
SCALE_FACTORS_BLESSED = frozenset(
(1.0, 10.0, 30.0, 100.0, 300.0, 1_000.0, 3_000.0, 10_000.0, 30_000.0, 100_000.0)
@@ -135,7 +126,7 @@ def backend(request: pytest.FixtureRequest) -> Backend:
return result
-def q(query_id: QueryID, *table_names: str) -> Query:
+def q(query_id: QueryID, *table_names: DBTableName) -> Query:
"""Create a Query with table names (paths resolved at runtime based on scale_factor)."""
return Query(query_id, table_names)
@@ -143,51 +134,26 @@ def q(query_id: QueryID, *table_names: str) -> Query:
def iter_queries() -> Iterator[Query]:
safe = SCALE_FACTORS_BLESSED | SCALE_FACTORS_QUITE_SAFE
yield from (
- q("q1", TBL_LINEITEM),
- q("q2", TBL_REGION, TBL_NATION, TBL_SUPPLIER, TBL_PART, TBL_PARTSUPP),
- q("q3", TBL_CUSTOMER, TBL_LINEITEM, TBL_ORDERS),
- q("q4", TBL_LINEITEM, TBL_ORDERS),
- q(
- "q5",
- TBL_REGION,
- TBL_NATION,
- TBL_CUSTOMER,
- TBL_LINEITEM,
- TBL_ORDERS,
- TBL_SUPPLIER,
- ),
- q("q6", TBL_LINEITEM),
- q("q7", TBL_NATION, TBL_CUSTOMER, TBL_LINEITEM, TBL_ORDERS, TBL_SUPPLIER),
- q(
- "q8",
- TBL_PART,
- TBL_SUPPLIER,
- TBL_LINEITEM,
- TBL_ORDERS,
- TBL_CUSTOMER,
- TBL_NATION,
- TBL_REGION,
- ),
- q(
- "q9",
- TBL_PART,
- TBL_PARTSUPP,
- TBL_NATION,
- TBL_LINEITEM,
- TBL_ORDERS,
- TBL_SUPPLIER,
- ),
- q("q10", TBL_CUSTOMER, TBL_NATION, TBL_LINEITEM, TBL_ORDERS),
- q("q11", TBL_NATION, TBL_PARTSUPP, TBL_SUPPLIER).with_skip(
+ q("q1", "lineitem"),
+ q("q2", "region", "nation", "supplier", "part", "partsupp"),
+ q("q3", "customer", "lineitem", "orders"),
+ q("q4", "lineitem", "orders"),
+ q("q5", "region", "nation", "customer", "lineitem", "orders", "supplier"),
+ q("q6", "lineitem"),
+ q("q7", "nation", "customer", "lineitem", "orders", "supplier"),
+ q("q8", "part", "supplier", "lineitem", "orders", "customer", "nation", "region"),
+ q("q9", "part", "partsupp", "nation", "lineitem", "orders", "supplier"),
+ q("q10", "customer", "nation", "lineitem", "orders"),
+ q("q11", "nation", "partsupp", "supplier").with_skip(
lambda _, scale_factor: scale_factor not in safe,
reason="https://github.com/duckdb/duckdb/issues/17965",
),
- q("q12", TBL_LINEITEM, TBL_ORDERS),
- q("q13", TBL_CUSTOMER, TBL_ORDERS),
- q("q14", TBL_LINEITEM, TBL_PART),
- q("q15", TBL_LINEITEM, TBL_SUPPLIER),
- q("q16", TBL_PART, TBL_PARTSUPP, TBL_SUPPLIER),
- q("q17", TBL_LINEITEM, TBL_PART)
+ q("q12", "lineitem", "orders"),
+ q("q13", "customer", "orders"),
+ q("q14", "lineitem", "part"),
+ q("q15", "lineitem", "supplier"),
+ q("q16", "part", "partsupp", "supplier"),
+ q("q17", "lineitem", "part")
.with_xfail(
lambda _, scale_factor: (scale_factor < 0.014),
reason="Generated dataset is too small, leading to 0 rows after the first two filters in `query1`.",
@@ -196,13 +162,13 @@ def iter_queries() -> Iterator[Query]:
lambda _, scale_factor: scale_factor not in safe,
reason="Non-deterministic fails for `duckdb`, `sqlframe`. All other always fail, except `pyarrow` which always passes 🤯.",
),
- q("q18", TBL_CUSTOMER, TBL_LINEITEM, TBL_ORDERS),
- q("q19", TBL_LINEITEM, TBL_PART),
- q("q20", TBL_PART, TBL_PARTSUPP, TBL_NATION, TBL_LINEITEM, TBL_SUPPLIER),
- q("q21", TBL_LINEITEM, TBL_NATION, TBL_ORDERS, TBL_SUPPLIER).with_skip(
+ q("q18", "customer", "lineitem", "orders"),
+ q("q19", "lineitem", "part"),
+ q("q20", "part", "partsupp", "nation", "lineitem", "supplier"),
+ q("q21", "lineitem", "nation", "orders", "supplier").with_skip(
lambda _, scale_factor: scale_factor not in safe, reason="Off-by-1 error"
),
- q("q22", TBL_CUSTOMER, TBL_ORDERS),
+ q("q22", "customer", "orders"),
)
diff --git a/tpch/typing_.py b/tpch/typing_.py
index b6c301045..59bfdcb6b 100644
--- a/tpch/typing_.py
+++ b/tpch/typing_.py
@@ -39,6 +39,10 @@ QueryID: TypeAlias = Literal[
"q21",
"q22",
]
+DBTableName: TypeAlias = Literal[
+ "lineitem", "customer", "nation", "orders", "part", "partsupp", "region", "supplier"
+]
+"""Table names used to construct paths dynamically."""
XFailRaises: TypeAlias = type[BaseException] | tuple[type[BaseException], ...]
Artifact: TypeAlias = Literal["database", "answers"]
There was a problem hiding this comment.
@FBruzzesi I'm happy to punt (#3427 (comment)) for now.
I just wanted to get your take on (#3427 (comment)) first before merging
Since what I proposed would mean we don't add these constants - it would make sense to include it here where they are changed anyway
| parser.addoption( | ||
| "--scale-factor", |
There was a problem hiding this comment.
So I've just tested this out and can confirm it works!
But, the downsides I'm seeing are:
1
When I run this, I need to go down 234 lines! before I see the descriptions:
$ cd tpch
$ pytest --help
...
...
Custom options:
...
...

2
When I run this, I don't get any log output to let me know that generate_data.py ran 😢:
$ pytest --scale-factor=0.1

There was a problem hiding this comment.
One way I'm thinking this could be improved is that we have a tpch entrypoint - and run commands through that.
For example, say I'm in the root of the narwhals repo.
Show help/list commands:
$ uv run tpch
Commands:
benchmark
generate
<TO BE DECIDED>
remove/clear/purge/reset/idk

Exactly the same as tpch/generate_data.py:
$ uv run tpch generate [OPTIONS]

Run pytest, but first go through generate to check we have the data.
I'm sure there's like 9464983 different ways to call into pytest 😂:
$ uv run tpch <TO BE DECIDED> [OPTIONS]

This would be the interface for (#805), if it makes sense to distinguish codspeed and pytest:
$ uv run tpch benchmark [OPTIONS]

A side benefit would be de-duplicating these two guys 😉:
generate_data.scale_factor_exists, conftest._scale_factor_data_exists

There was a problem hiding this comment.
Thanks for the feedback @dangotbanned
RE: #3427 (comment)
- We can internally document it. That's how pytest shows custom options
- Sure I will configure the logger in
pytest_configure
RE: #3427 (comment)
Not fully sure how I feel about it, I might need to sleep on this a bit more
There was a problem hiding this comment.
After 1e15604 (if the scale factor needs to be generated):
pytest --scale-factor=0.1
2026-01-30 10:56:58,084 [INFO] Connecting to in-memory DuckDB database
2026-01-30 10:56:58,089 [INFO] Installing DuckDB TPC-H Extension
2026-01-30 10:56:58,104 [INFO] Generating data for scale_factor=0.1
2026-01-30 10:56:58,663 [INFO] Finished generating data.
2026-01-30 10:56:58,663 [INFO] Writing data to: /Users/francesco.bruzzesi/Desktop/oss/narwhals-dev/tpch/data/sf0.1
2026-01-30 10:56:58,663 [INFO] ┌──────────────────┬────────────┐
2026-01-30 10:56:58,663 [INFO] │ File ┆ Size │
2026-01-30 10:56:58,663 [INFO] ╞══════════════════╪════════════╡
2026-01-30 10:56:58,908 [INFO] │ lineitem.parquet ┆ 15.39 mb │
2026-01-30 10:56:58,923 [INFO] │ customer.parquet ┆ 815.82 kb │
2026-01-30 10:56:58,924 [INFO] │ nation.parquet ┆ 2.98 kb │
2026-01-30 10:56:58,980 [INFO] │ orders.parquet ┆ 3.43 mb │
2026-01-30 10:56:58,993 [INFO] │ part.parquet ┆ 403.37 kb │
2026-01-30 10:56:59,035 [INFO] │ partsupp.parquet ┆ 2.85 mb │
2026-01-30 10:56:59,037 [INFO] │ region.parquet ┆ 1.91 kb │
2026-01-30 10:56:59,040 [INFO] │ supplier.parquet ┆ 57.33 kb │
2026-01-30 10:56:59,040 [INFO] └──────────────────┴────────────┘
2026-01-30 10:56:59,040 [INFO] Executing tpch queries for answers
2026-01-30 10:56:59,040 [INFO] ┌────────────────────┬────────────┐
2026-01-30 10:56:59,040 [INFO] │ File ┆ Size │
2026-01-30 10:56:59,040 [INFO] ╞════════════════════╪════════════╡
2026-01-30 10:56:59,046 [INFO] │ result_q1.parquet ┆ 3.54 kb │
2026-01-30 10:56:59,056 [INFO] │ result_q2.parquet ┆ 6.51 kb │
2026-01-30 10:56:59,063 [INFO] │ result_q3.parquet ┆ 1.72 kb │
2026-01-30 10:56:59,070 [INFO] │ result_q4.parquet ┆ 982.00 b │
2026-01-30 10:56:59,080 [INFO] │ result_q5.parquet ┆ 885.00 b │
2026-01-30 10:56:59,082 [INFO] │ result_q6.parquet ┆ 484.00 b │
2026-01-30 10:56:59,090 [INFO] │ result_q7.parquet ┆ 1.55 kb │
2026-01-30 10:56:59,098 [INFO] │ result_q8.parquet ┆ 861.00 b │
2026-01-30 10:56:59,112 [INFO] │ result_q9.parquet ┆ 2.82 kb │
2026-01-30 10:56:59,131 [INFO] │ result_q10.parquet ┆ 5.42 kb │
2026-01-30 10:56:59,137 [INFO] │ result_q11.parquet ┆ 18.85 kb │
2026-01-30 10:56:59,144 [INFO] │ result_q12.parquet ┆ 1.18 kb │
2026-01-30 10:56:59,160 [INFO] │ result_q13.parquet ┆ 1.06 kb │
2026-01-30 10:56:59,164 [INFO] │ result_q14.parquet ┆ 510.00 b │
2026-01-30 10:56:59,167 [INFO] │ result_q15.parquet ┆ 2.13 kb │
2026-01-30 10:56:59,179 [INFO] │ result_q16.parquet ┆ 5.85 kb │
2026-01-30 10:56:59,185 [INFO] │ result_q17.parquet ┆ 497.00 b │
2026-01-30 10:56:59,193 [INFO] │ result_q18.parquet ┆ 2.46 kb │
2026-01-30 10:56:59,201 [INFO] │ result_q19.parquet ┆ 484.00 b │
2026-01-30 10:56:59,207 [INFO] │ result_q20.parquet ┆ 1.35 kb │
2026-01-30 10:56:59,228 [INFO] │ result_q21.parquet ┆ 1.17 kb │
2026-01-30 10:56:59,235 [INFO] │ result_q22.parquet ┆ 1.26 kb │
2026-01-30 10:56:59,235 [INFO] └────────────────────┴────────────┘
2026-01-30 10:56:59,235 [INFO] Finished with total file size: 2.03 kb
================================================================ test session starts =================================================================
platform darwin -- Python 3.12.11, pytest-9.0.2, pluggy-1.6.0
Using --randomly-seed=2068288704
rootdir: /Users/francesco.bruzzesi/Desktop/oss/narwhals-dev
configfile: pyproject.toml
plugins: anyio-4.12.1, hypothesis-6.150.2, xdist-3.8.0, randomly-4.0.1, env-1.2.0, cov-7.0.0
collected 132 items
tests/queries_test.py ..................................................................
There was a problem hiding this comment.
Well I am sure am happy to see our precious logs again 😄
Not fully sure how I feel about it, I might need to sleep on this a bit more
That's okay!
For context, I've had this idea stuck in my head after following links from (duckdb/duckdb#17965):
- CLI: (https://github.com/spiceai/spiceai/tree/c4e18c393dcfade1a4e53bf42e571be2a1be9f9f/tools/testoperator#running-benchmarks)
- .sh script: (https://github.com/rapidsai/velox-testing/tree/8f5d358cca8bf419e1d4dc344e1254d2a623686e?tab=readme-ov-file#running-benchmarks)
Zooming out slightly, but still for TPC-H:
Makefile: (https://github.com/pola-rs/polars-benchmark/blob/8975df5acb720c520dfdeb888d3668ba151f41df/Makefile)
Zooming out further, and seeing what our upstream backends do generally for benchmarks:
- justfile: (https://github.com/ibis-project/ibis/blob/d70c35edf5c6c04a7ea820cb17b20967c9bdf248/justfile#L231-L237)
- CLI: (https://github.com/duckdb/duckdb/blob/c87733c87b41704b22f97ea2ba97c0f7d30c0974/scripts/run_benchmark.py)
- CLI: (https://github.com/apache/arrow/blob/c6090ed5d7047b97c6cabbd8e7728183a549409c/dev/archery/archery/cli.py)
…data need to be generated
…data need to be generated
Removes the parameter from 6 functions
- No need to pass around `scale_factor` - Clear constructors for each entry point - Also who doesn't like method-chaining?
| from tpch.constants import ( | ||
| DATABASE_TABLE_NAMES, | ||
| LOGGER_NAME, | ||
| QUERIES_PACKAGE, | ||
| QUERY_IDS, | ||
| SCALE_FACTOR_DEFAULT, | ||
| _scale_factor_dir, | ||
| ) |
There was a problem hiding this comment.
Was this an intentional change, or lost in the conflicts?
In (134875f) I switched to constants so that the imports didn't take up so many lines 😢
There was a problem hiding this comment.
Ah I see! Sorry feel free to revert it
There was a problem hiding this comment.
I'll keep it for now, this next branch has waaaay more of a negative diff 😄
Double revert #3427 (comment)
Description
Related to #3421 (comment)
This PR adds the possibility to store multiple scale factors in the same data folder.
As a consequence, it adds a parameter to pytest (--scale-factor, default 0.1) that can be passed. If the scale-factor data don't exist, they will be created before running the tests.

All other capabilities are left the same.
What type of PR is this? (check all applicable)
Related issues
#3421 (comment)
Checklist